import os
import pandas as pd
import numpy as np
import logging
LOGGER = logging.getLogger(__name__)
from init import PATHS


def aggregate_at_familylevel(namefile):
    """Build the full family-level table for `namefile` and write it to CSV.

    The three partial tables (basic variables, countries/grants, citations)
    are computed one after the other, placed side by side with
    ``pd.concat(axis=1)``, and saved as ``Families_of_<namefile>.csv`` under
    ``PATHS.dropbox``.  Nothing is returned.
    """
    LOGGER.info('Aggregating')
    partial_tables = [
        AggregatingBasicVariables(namefile),
        AggregatingCountriesAndGranted(namefile),
        AggregatingCitations(namefile),
    ]
    # Column-wise concatenation aligns the partial tables on their index;
    # reset_index() then moves that index into the table before writing.
    families = pd.concat(partial_tables, axis=1).reset_index()
    target = PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_{}.csv'.format(namefile)
    families.to_csv(target, index=False)


def aggregate_at_familylevel_short(namefile):
    """Build a reduced family-level table for `namefile` and write it to CSV.

    Reads ``Patents_of_<namefile>.csv`` in full, aggregates filing dates,
    grant flags, country variables and the 3-year citation count per
    ``docdb_family_id``, and saves ``Families_of_<namefile>.csv`` under
    ``PATHS.dropbox``.  Nothing is returned.
    """
    LOGGER.info('Aggregating')
    df_appinfo = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Patents_of_{}.csv'.format(namefile))

    # The 9999-12-31 sentinel cannot be converted by pd.to_datetime, so it is
    # swapped for 2030-01-01.  Each spelling of the sentinel is replaced by the
    # placeholder in the *same* spelling: mixing "date only" and "date + time"
    # strings in one column makes pd.to_datetime fail on pandas >= 2.0, which
    # parses the whole column with the format inferred from its first value.
    placeholder_for_sentinel = {
        '9999-12-31 00:00:00': '2030-01-01 00:00:00',
        '9999-12-31': '2030-01-01',
    }
    earliest_dates = df_appinfo['earliest_filing_date'].replace(placeholder_for_sentinel)
    df_appinfo['earliest_filing_date'] = pd.to_datetime(earliest_dates)

    by_family = df_appinfo.groupby('docdb_family_id')
    df_fam = pd.DataFrame(by_family['appln_id'].count().rename('nbr_app_in_fam'))
    # Plain .min() returns a Series; the former .agg([np.min]) went through a
    # one-column DataFrame to obtain the same values.
    df_fam['earliest_filing_date'] = by_family['earliest_filing_date'].min()
    df_fam['earliest_filing_year'] = by_family['earliest_filing_year'].min()
    # appln_filing_date is not converted here, so this is the minimum of the
    # values exactly as they were read from the CSV.
    df_fam['appln_filing_date'] = by_family['appln_filing_date'].min()

    df_fam = GenerateGranted(df_fam, df_appinfo)
    list_allcountries, list_oecdmembers, list_nonOECD, list_nonOECDbigpatentingcountries = Getlist_countries()
    df_fam = GenerateCountries(df_fam, df_appinfo, list_allcountries, list_oecdmembers, list_nonOECD)
    df_fam = AddCitations_3years(df_fam, df_appinfo)
    df_fam.to_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_{}.csv'.format(namefile), index=False)
    return


def ImportFamilies(coltoaggregate, namefile):
    """Load the requested columns of ``Patents_of_<namefile>.csv``.

    coltoaggregate: column names handed to ``pd.read_csv`` as ``usecols``.
    namefile: suffix inserted into the CSV file name.
    Returns the resulting DataFrame.
    """
    patents_csv = PATHS.dropbox / 'Data_outputted/C_PatentVariables/Patents_of_{}.csv'.format(namefile)
    return pd.read_csv(patents_csv, usecols=coltoaggregate)


def AggregatingBasicVariables(namefile):
    """Aggregate the basic application variables to one row per family.

    Loads the needed columns of the application-level file for `namefile`
    and returns a DataFrame indexed by ``docdb_family_id`` holding the number
    of distinct applications, the mean ``docdb_family_size``, the maximum
    number of applicants and inventors, plus the columns added by
    GenerateEarliestFiling, GenerateApplicationFiling and
    GenerateInpadocFamId.
    """
    LOGGER.info('Aggregating Basic Variables')
    coltoaggregate = ['docdb_family_id', 'appln_id', 'docdb_family_size', 'nb_applicants', 'nb_inventors',
                      'earliest_filing_date', 'earliest_filing_year', 'earliest_filing_id',
                      'appln_filing_date', 'inpadoc_family_id']
    df_appinfo = ImportFamilies(coltoaggregate, namefile)

    # One groupby object serves every per-family statistic below.
    by_family = df_appinfo.groupby('docdb_family_id')
    df_fam = pd.DataFrame(by_family['appln_id'].nunique().rename('nbr_app_in_fam'))
    # .mean() yields a Series directly; the former .agg([np.mean]) produced a
    # one-column DataFrame that was then squeezed into the column.
    df_fam['docdb_family_size'] = by_family['docdb_family_size'].mean()
    df_fam['nb_applicants'] = by_family['nb_applicants'].max()
    df_fam['nb_inventors'] = by_family['nb_inventors'].max()

    df_fam = GenerateEarliestFiling(df_fam, df_appinfo)
    df_fam = GenerateApplicationFiling(df_fam, df_appinfo)
    df_fam = GenerateInpadocFamId(df_fam, df_appinfo)
    return df_fam


def AggregatingCountriesAndGranted(namefile):
    """Aggregate filing-authority and grant information to family level.

    Loads the authority and grant columns of the application-level file for
    `namefile`, then lets GenerateCountries, GenerateCountsperCountries and
    GenerateGranted add their columns.  The application count is only used to
    create the family-indexed frame and is removed before returning.
    """
    LOGGER.info('Aggregating Countries And Granted')
    df_appinfo = ImportFamilies(['docdb_family_id', 'appln_id', 'appln_auth', 'granted'], namefile)

    # Seed frame: one row per family, indexed by docdb_family_id.
    applications_per_family = df_appinfo.groupby('docdb_family_id')['appln_id'].count()
    df_fam = pd.DataFrame(applications_per_family.rename('nbr_app_in_fam'))

    all_countries, oecd_members, non_oecd, big_non_oecd = Getlist_countries()
    df_fam = GenerateCountries(df_fam, df_appinfo, all_countries, oecd_members, non_oecd)
    df_fam = GenerateCountsperCountries(df_fam, df_appinfo, oecd_members, big_non_oecd)
    df_fam = GenerateGranted(df_fam, df_appinfo)
    return df_fam.drop(columns=['nbr_app_in_fam'])


def AggregatingCitations(namefile):
    """Aggregate citation counts to one row per family.

    Returns a DataFrame indexed by ``docdb_family_id`` with the mean of
    ``nb_citing_docdb_fam`` over the family's applications and the columns of
    the citation-count file, whose ``within_Nyr`` columns are renamed to
    ``nb_citing_docdbfam_wNy``.
    """
    LOGGER.info('Aggregating citation counts')
    coltoaggregate = ['docdb_family_id', 'appln_id', 'nb_citing_docdb_fam']
    df_appinfo = ImportFamilies(coltoaggregate, namefile)
    by_family = df_appinfo.groupby('docdb_family_id')
    df_fam = pd.DataFrame(by_family['appln_id'].count().rename('nbr_app_in_fam'))
    # .mean() returns a Series directly (previously .agg([np.mean]), which
    # produced a one-column DataFrame).
    df_fam['nb_citing_docdb_fam'] = by_family['nb_citing_docdb_fam'].mean()

    df_cit = pd.read_csv(PATHS.citations / 'docdbid_citationcounts_fromapplndata.csv')
    df_cit = df_cit.rename(columns={
        'cited_docdb_family_id': 'docdb_family_id',
        'within_1yr': 'nb_citing_docdbfam_w1y',
        'within_3yr': 'nb_citing_docdbfam_w3y',
        'within_4yr': 'nb_citing_docdbfam_w4y',
        'within_5yr': 'nb_citing_docdbfam_w5y',
    })
    df_fam = df_fam.merge(df_cit, on='docdb_family_id', how='left')

    # A docdb family appears in df_cit only if it has ever been cited, so
    # never-cited families come out of the left merge with missing values.
    # Fill those with 0, but only where nb_citing_docdb_fam itself already
    # reports zero citations.
    never_cited = df_fam['nb_citing_docdb_fam'] == 0
    colnames = ['nb_citing_docdbfam_w1y', 'nb_citing_docdbfam_w3y', 'nb_citing_docdbfam_w4y', 'nb_citing_docdbfam_w5y']
    for col in colnames:
        # Assign the result back: mask(..., inplace=True) on the df_fam[col]
        # selection is a chained in-place update, which does not propagate to
        # df_fam when pandas copy-on-write is active.
        df_fam[col] = df_fam[col].mask(never_cited & df_fam[col].isnull(), 0)

    del df_fam['nbr_app_in_fam']
    df_fam = df_fam.set_index('docdb_family_id')
    return df_fam


def GenerateEarliestFiling(df_fam, df_appinfo):
    """Add earliest filing date, year and id per family to `df_fam`.

    df_fam: frame indexed by ``docdb_family_id``.
    df_appinfo: application-level frame with ``docdb_family_id``,
        ``earliest_filing_date``, ``earliest_filing_year`` and
        ``earliest_filing_id``.  Its ``earliest_filing_date`` column is
        converted to datetime in place.

    Returns `df_fam` (still indexed by ``docdb_family_id``) with the minimum
    ``earliest_filing_date`` and ``earliest_filing_year`` of each family and
    the smallest ``earliest_filing_id`` recorded at that minimum date.
    """
    LOGGER.info('       Generate Earliest Filing')
    # The 9999-12-31 sentinel cannot be converted by pd.to_datetime, so it is
    # swapped for 2030-01-01.  Each spelling of the sentinel gets the
    # placeholder in the same spelling: a column mixing "date only" and
    # "date + time" strings makes pd.to_datetime fail on pandas >= 2.0, which
    # parses the whole column with the format inferred from its first value.
    placeholder_for_sentinel = {
        '9999-12-31 00:00:00': '2030-01-01 00:00:00',
        '9999-12-31': '2030-01-01',
    }
    earliest_dates = df_appinfo['earliest_filing_date'].replace(placeholder_for_sentinel)
    df_appinfo['earliest_filing_date'] = pd.to_datetime(earliest_dates)

    by_family = df_appinfo.groupby('docdb_family_id')
    df_fam['earliest_filing_date'] = by_family['earliest_filing_date'].min()
    # keep earliest of the earliest_filing_year
    df_fam['earliest_filing_year'] = by_family['earliest_filing_year'].min()

    # Look up the filing id(s) recorded at each family's earliest date.
    id_at_date = df_appinfo[['docdb_family_id', 'earliest_filing_date', 'earliest_filing_id']].drop_duplicates()
    df_fam = df_fam.reset_index().merge(id_at_date, on=['docdb_family_id', 'earliest_filing_date'], how='left')
    df_fam = df_fam.set_index('docdb_family_id')

    # Some applications share exactly the same earliest_filing_date but carry
    # different earliest_filing_id values, which duplicates the family row in
    # the merge above.  Keep the smallest id, then collapse the now-identical
    # rows back to one per family.
    df_fam['earliest_filing_id'] = df_fam.reset_index().groupby('docdb_family_id')['earliest_filing_id'].min()
    df_fam = df_fam.reset_index().drop_duplicates().set_index('docdb_family_id')
    return df_fam


def GenerateApplicationFiling(df_fam, df_appinfo):
    """Add each family's earliest ``appln_filing_date`` to `df_fam`.

    df_fam: frame indexed by ``docdb_family_id``.
    df_appinfo: application-level frame with ``docdb_family_id`` and
        ``appln_filing_date``; the date column is converted to datetime in
        place.

    Returns `df_fam` with an ``appln_filing_date`` column.
    """
    LOGGER.info('       Generate Application Filing')
    # The 9999-12-31 sentinel cannot be converted by pd.to_datetime, so it is
    # swapped for 2030-01-01.  Each spelling of the sentinel gets the
    # placeholder in the same spelling: a column mixing "date only" and
    # "date + time" strings makes pd.to_datetime fail on pandas >= 2.0, which
    # parses the whole column with the format inferred from its first value.
    placeholder_for_sentinel = {
        '9999-12-31 00:00:00': '2030-01-01 00:00:00',
        '9999-12-31': '2030-01-01',
    }
    filing_dates = df_appinfo['appln_filing_date'].replace(placeholder_for_sentinel)
    df_appinfo['appln_filing_date'] = pd.to_datetime(filing_dates)
    df_fam['appln_filing_date'] = df_appinfo.groupby('docdb_family_id')['appln_filing_date'].min()
    return df_fam


def GenerateInpadocFamId(df_fam, df_appinfo):
    """Attach the INPADOC family id(s) of every DOCDB family to `df_fam`.

    The distinct ``inpadoc_family_id`` values found for a family are rendered
    as text and joined with commas.  `df_fam` must be indexed by
    ``docdb_family_id``; the returned frame keeps that index.
    """
    LOGGER.info('       Generate inpadoc_family_id')

    def _comma_join(ids):
        # Render each id with str() so non-string ids can be joined.
        return ','.join(map(str, ids))

    distinct_pairs = df_appinfo[['docdb_family_id', 'inpadoc_family_id']].drop_duplicates()
    joined_ids = distinct_pairs.groupby('docdb_family_id')['inpadoc_family_id'].agg(_comma_join)

    # Merge on the family id as a regular column, then restore it as index.
    merged = df_fam.reset_index().merge(joined_ids.reset_index(), on='docdb_family_id', how='left')
    return merged.set_index('docdb_family_id')


def Getlist_countries():
    """Build the country-code lists used by the family-level aggregations.

    Reads ``tls801_part01.csv`` from ``PATHS.patstatglobal`` with every
    column as text and returns, in this order:

    * list_allcountries: ``ctry_code`` of rows whose ``organisation_flag``
      is not 'Y' (not de-duplicated);
    * list_oecdmembers: distinct ``ctry_code`` with ``oecd_member == 'Y'``;
    * list_nonOECD: distinct ``ctry_code`` with ``oecd_member != 'Y'``,
      without missing codes, 'EP', 'WO' and blank codes;
    * list_nonOECDbigpatentingcountries: a fixed list of authorities.
    """
    # IMPORT LIST OECD COUNTRIES FROM PATSTAT
    # NOTE(review): pandas' default NA markers include the literal string
    # 'NA', so a ctry_code spelled 'NA' would be read as missing here --
    # confirm whether keep_default_na=False is wanted.
    Countries = pd.read_csv(PATHS.patstatglobal / 'tls801_part01.csv', dtype=str, sep=',')
    LOGGER.info('       Imported Countries: tls801_part01.csv')

    is_oecd = Countries['oecd_member'] == 'Y'
    list_oecdmembers = Countries.loc[is_oecd, 'ctry_code'].drop_duplicates().tolist()
    LOGGER.info('       {} OECD countries'.format(len(list_oecdmembers)))

    list_allcountries = Countries.loc[Countries['organisation_flag'] != 'Y', 'ctry_code'].tolist()

    not_oecd_codes = Countries.loc[Countries['oecd_member'] != 'Y', 'ctry_code'].drop_duplicates().tolist()
    excluded_codes = {'EP', 'WO', ' ', '  '}
    # Missing codes are detected with pd.isna(): the former
    # `i not in [np.nan, ...]` only matched a missing value that happened to
    # be the very same object as np.nan, because NaN never compares equal.
    list_nonOECD = [code for code in not_oecd_codes
                    if not pd.isna(code) and code not in excluded_codes]

    list_nonOECDbigpatentingcountries = ['EP', 'WO', 'CN', 'TW', 'BR', 'RU', 'ZA', 'SG', 'HK', 'MY', 'IN', 'EA']
    return list_allcountries, list_oecdmembers, list_nonOECD, list_nonOECDbigpatentingcountries


def GenerateCountries(df_fam, df_appinfo, list_allcountries, list_oecdmembers, list_nonOECD):
    """Add the filing-authority list and country counts per family to `df_fam`.

    df_fam: frame indexed by ``docdb_family_id``.
    df_appinfo: application-level frame with ``docdb_family_id`` and
        ``appln_auth``.
    list_allcountries, list_oecdmembers, list_nonOECD: authority codes that
        count towards ``NbrCountries``, ``NbrOECDcountries`` and
        ``NbrNonOECDcountries`` respectively.

    Added columns: ``Countries`` (distinct authorities of the family joined
    with commas), ``NbrAuth`` (how many there are), the three counts above,
    and ``ifTriadic`` (see checkTriadic).  Families without any entry in
    `df_appinfo` get 0 for the counts and whatever checkTriadic returns for a
    non-string value.
    """
    LOGGER.info('       Generate Countries List')
    # Missing authorities are dropped before joining: ','.join() raises a
    # TypeError as soon as one element is NaN instead of a string.
    df_fam['Countries'] = df_appinfo.groupby('docdb_family_id')['appln_auth'].agg(
        lambda auths: ','.join(auths.dropna().unique()))

    # Sets make each membership test O(1); they are built once instead of
    # scanning the lists for every code of every family.
    all_countries = set(list_allcountries)
    oecd_members = set(list_oecdmembers)
    non_oecd = set(list_nonOECD)

    def _split_codes(countries):
        # A non-string value (NaN) means the family has no row in df_appinfo;
        # an empty string means all of its authorities were missing.
        if isinstance(countries, str) and countries:
            return countries.split(',')
        return []

    # Split every family's string once and reuse the result for all counts.
    codes_per_family = df_fam['Countries'].apply(_split_codes)

    LOGGER.info('       Generate NbrCountries')
    df_fam['NbrAuth'] = codes_per_family.apply(len)
    df_fam['NbrCountries'] = codes_per_family.apply(
        lambda codes: sum(code in all_countries for code in codes))
    LOGGER.info('       Generate NbrOECDcountries')
    df_fam['NbrOECDcountries'] = codes_per_family.apply(
        lambda codes: sum(code in oecd_members for code in codes))
    LOGGER.info('       Generate NbrNonOECDcountries')
    df_fam['NbrNonOECDcountries'] = codes_per_family.apply(
        lambda codes: sum(code in non_oecd for code in codes))
    LOGGER.info('       Generate ifTriadic')
    df_fam['ifTriadic'] = df_fam['Countries'].apply(checkTriadic)
    return df_fam


def checkTriadic(x):
    """Tell whether a comma-separated authority list contains EP, JP and US.

    x: string of authority codes separated by commas (no surrounding
       whitespace is stripped), or any non-string value.

    Returns True when all three codes are present as whole items, False when
    at least one is missing, and ``np.nan`` when `x` is not a string.
    """
    if not isinstance(x, str):
        return np.nan
    # Whole-item comparison: 'EPO' does not count as 'EP'.
    return {'EP', 'JP', 'US'}.issubset(x.split(','))


def GenerateCountsperCountries(df_fam, df_appinfo, list_oecdmembers, list_nonOECDbigpatentingcountries):
    """Add one column per filing authority with the family's application count.

    df_fam: frame indexed by ``docdb_family_id``.
    df_appinfo: application-level frame with ``docdb_family_id``,
        ``appln_auth`` and ``appln_id``.
    list_oecdmembers, list_nonOECDbigpatentingcountries: candidate authority
        codes; a column is only created for candidates that actually occur
        in ``df_appinfo['appln_auth']``.

    Each new column is named after the authority code and holds the number of
    applications of the family filed there; families without such an
    application are left missing.  Returns `df_fam`.
    """
    LOGGER.info('       Generate Counts per Countries')
    df_famcountries = df_appinfo.groupby(['docdb_family_id', 'appln_auth'])['appln_id'].count().reset_index()
    df_famcountries = df_famcountries.set_index('docdb_family_id')

    # Build the set of observed authorities once; it used to be rebuilt for
    # every candidate code inside the comprehension.
    observed_authorities = set(df_famcountries['appln_auth'])
    list_country_col = [code for code in list_oecdmembers + list_nonOECDbigpatentingcountries
                        if code in observed_authorities]
    LOGGER.info('       Nbr Countries var to be created {}'.format(len(list_country_col)))
    LOGGER.info('       List Countries {}'.format(list_country_col))

    for co in list_country_col:
        # Counts for this authority, indexed by family id; the assignment
        # aligns them on df_fam's index.
        df_fam[co] = df_famcountries.loc[df_famcountries['appln_auth'] == co, 'appln_id']
    return df_fam


def GenerateGranted(df_fam, df_appinfo):
    """Flag, per family, whether any application carries ``granted == 'Y'``.

    ``Granted`` looks at all applications of the family; ``Granted_US``,
    ``Granted_EP`` and ``Granted_JP`` only at those whose ``appln_auth`` is
    the respective authority, and stay missing for families that have no
    application there.  `df_fam` must be indexed by ``docdb_family_id`` and
    is returned with the four columns added.
    """
    LOGGER.info('       Generate Granted')

    def _has_grant(flags):
        # True as soon as one of the group's values is 'Y'.
        return 'Y' in flags.unique()

    def _granted_by_family(applications):
        return applications.groupby('docdb_family_id')['granted'].agg(_has_grant)

    df_fam['Granted'] = _granted_by_family(df_appinfo)
    per_authority = (('Granted_US', 'US'), ('Granted_EP', 'EP'), ('Granted_JP', 'JP'))
    for column, authority in per_authority:
        filed_there = df_appinfo['appln_auth'] == authority
        df_fam[column] = _granted_by_family(df_appinfo[filed_there])
    return df_fam


def AddCitations_3years(df_fam, df_appinfo):
    """Add the overall and the 3-year citation counts to `df_fam`.

    df_fam: frame indexed by ``docdb_family_id``.
    df_appinfo: application-level frame with ``docdb_family_id`` and
        ``nb_citing_docdb_fam``.

    ``nb_citing_docdb_fam`` is the mean over the family's applications.
    ``nb_citing_docdbfam_w3y`` comes from the ``within_3yr`` column of the
    citation-count file; where it is missing after the left merge and
    ``nb_citing_docdb_fam`` is 0, it is set to 0.  Returns the merged frame.
    """
    # .mean() returns a Series directly (previously .agg([np.mean]), which
    # produced a one-column DataFrame).
    df_fam['nb_citing_docdb_fam'] = df_appinfo.groupby('docdb_family_id')['nb_citing_docdb_fam'].mean()

    cols = ['cited_docdb_family_id', 'within_3yr']
    df_cit = pd.read_csv(PATHS.citations / 'docdbid_citationcounts_fromapplndata.csv', usecols=cols)
    df_cit = df_cit.rename(columns={
        'cited_docdb_family_id': 'docdb_family_id',
        'within_3yr': 'nb_citing_docdbfam_w3y',
    })
    df_fam = df_fam.merge(df_cit, on='docdb_family_id', how='left')

    never_cited = df_fam['nb_citing_docdb_fam'] == 0
    within_3y = df_fam['nb_citing_docdbfam_w3y']
    # Assign the result back: mask(..., inplace=True) on the selected column
    # is a chained in-place update, which does not propagate to df_fam when
    # pandas copy-on-write is active.
    df_fam['nb_citing_docdbfam_w3y'] = within_3y.mask(never_cited & within_3y.isnull(), 0)
    return df_fam
